import pandas as pd
import numpy as np
from sklearn import tree
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
from sklearn import svm
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Load the car-evaluation data set.  The CSV ships without a header row,
# so the column names are supplied directly at read time.
feature_columns = ['buying', 'maint', 'doors', 'persons',
                   'lug_boot', 'safety', 'classes']
df = pd.read_csv(r"cars.csv", header=None, names=feature_columns)
df.head()
| buying | maint | doors | persons | lug_boot | safety | classes | |
|---|---|---|---|---|---|---|---|
| 0 | vhigh | vhigh | 2 | 2 | small | low | unacc |
| 1 | vhigh | vhigh | 2 | 2 | small | med | unacc |
| 2 | vhigh | vhigh | 2 | 2 | small | high | unacc |
| 3 | vhigh | vhigh | 2 | 2 | med | low | unacc |
| 4 | vhigh | vhigh | 2 | 2 | med | med | unacc |
df.shape
(1728, 7)
# Inspect every column's category levels and their frequencies.
for column in df.columns:
    separator = "----------------{}-----------------------".format(column)
    print(separator)
    print(df[column].value_counts())
    print()
----------------buying----------------------- buying vhigh 432 high 432 med 432 low 432 Name: count, dtype: int64 ----------------maint----------------------- maint vhigh 432 high 432 med 432 low 432 Name: count, dtype: int64 ----------------doors----------------------- doors 2 432 3 432 4 432 5more 432 Name: count, dtype: int64 ----------------persons----------------------- persons 2 576 4 576 more 576 Name: count, dtype: int64 ----------------lug_boot----------------------- lug_boot small 576 med 576 big 576 Name: count, dtype: int64 ----------------safety----------------------- safety low 576 med 576 high 576 Name: count, dtype: int64 ----------------classes----------------------- classes unacc 1210 acc 384 good 69 vgood 65 Name: count, dtype: int64
In the above value counts we can see that the columns contain categorical strings (some of them numeric-looking), so we need to replace the categories with numbers using a label encoder.
# Check for missing values: count nulls per column.
# (The output below shows zero for every column -- the data set is complete.)
df.isnull().sum()
buying 0 maint 0 doors 0 persons 0 lug_boot 0 safety 0 classes 0 dtype: int64
In the above output we can see that there are no missing values.
# Check basic info: dtypes, non-null counts, and memory usage.
# All 7 columns are dtype object, so each one must be label-encoded.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1728 entries, 0 to 1727 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 buying 1728 non-null object 1 maint 1728 non-null object 2 doors 1728 non-null object 3 persons 1728 non-null object 4 lug_boot 1728 non-null object 5 safety 1728 non-null object 6 classes 1728 non-null object dtypes: object(7) memory usage: 94.6+ KB
We need to apply the label encoder to all the columns because every column has the object data type.
# Apply label encoding to turn each column's categories into integer codes.
#
# NOTE(review): a fresh LabelEncoder is fitted per column and kept in
# `encoders`, so any column can later be inverse-transformed.  The original
# code reused one shared encoder, which only retained the fit of the LAST
# column.  Also note the codes follow alphabetical order of the strings,
# not the natural ordinal order (e.g. low < med < high is not preserved);
# that is acceptable for tree models, which only need distinct codes.
encoders = {}
for col in df.columns:
    enc = LabelEncoder()
    df[col] = enc.fit_transform(df[col])
    encoders[col] = enc
    mapping = dict(zip(enc.classes_, enc.transform(enc.classes_)))
    print("--------------------------------------------------")
    print("Feature: ", col)
    print("Mapping: ", mapping)
--------------------------------------------------
Feature: buying
Mapping: {'high': 0, 'low': 1, 'med': 2, 'vhigh': 3}
--------------------------------------------------
Feature: maint
Mapping: {'high': 0, 'low': 1, 'med': 2, 'vhigh': 3}
--------------------------------------------------
Feature: doors
Mapping: {'2': 0, '3': 1, '4': 2, '5more': 3}
--------------------------------------------------
Feature: persons
Mapping: {'2': 0, '4': 1, 'more': 2}
--------------------------------------------------
Feature: lug_boot
Mapping: {'big': 0, 'med': 1, 'small': 2}
--------------------------------------------------
Feature: safety
Mapping: {'high': 0, 'low': 1, 'med': 2}
--------------------------------------------------
Feature: classes
Mapping: {'acc': 0, 'good': 1, 'unacc': 2, 'vgood': 3}
df.head()
| buying | maint | doors | persons | lug_boot | safety | classes | |
|---|---|---|---|---|---|---|---|
| 0 | 3 | 3 | 0 | 0 | 2 | 1 | 2 |
| 1 | 3 | 3 | 0 | 0 | 2 | 2 | 2 |
| 2 | 3 | 3 | 0 | 0 | 2 | 0 | 2 |
| 3 | 3 | 3 | 0 | 0 | 1 | 1 | 2 |
| 4 | 3 | 3 | 0 | 0 | 1 | 2 | 2 |
# Split into features and target: every column except the last one holds
# the independent variables; the final 'classes' column is the label.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10)

# Sanity-check the shapes of the resulting partitions.
print("Total Data: ", df.shape)
print("Train X: ", X_train.shape)
print("Train y: ", y_train.shape)
print("Test X: ", X_test.shape)
print("Test y: ", y_test.shape)
Total Data: (1728, 7) Train X: (1209, 6) Train y: (1209,) Test X: (519, 6) Test y: (519,)
# Min-max scale the features into [0, 1].  The scaler is fitted on the
# training split only and then applied to both splits, so no information
# leaks from the test set into the fit.
scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)
Note: In this data set all the encoded values range from 0 to 3, so there are no extreme high or low values; that is why standardisation/normalisation is not strictly required here.
# compare normal and scale data
# (first training row before and after min-max scaling; scaled values lie in [0, 1])
print(X_train.iloc[0])
print(X_train_scale[0])
buying 0 maint 0 doors 1 persons 2 lug_boot 0 safety 0 Name: 593, dtype: int32 [0. 0. 0.33333333 1. 0. 0. ]
# Build the decision-tree classifier with default hyper-parameters.
# NOTE(review): random_state added for reproducibility -- without it,
# tie-breaking among equally good splits is randomized, so repeated runs
# can produce slightly different trees.  Seed 10 matches the seed used
# for train_test_split above.
model_dt = DecisionTreeClassifier(random_state=10)

# Example of a pruned model with custom hyper-parameters:
# model_dt = DecisionTreeClassifier(criterion='gini',
#                                   random_state=10,
#                                   min_samples_leaf=5,
#                                   min_samples_split=20,
#                                   max_leaf_nodes=15,
#                                   max_depth=6)

# Fit the model on the (unscaled) training data and later predict with it.
model_dt.fit(X_train, y_train)
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
model_dt.get_depth()
14
# Predict the class of every test row and display (actual, predicted)
# pairs for a quick eyeball comparison.
y_pred = model_dt.predict(X_test)
pairs = list(zip(y_test, y_pred))
print(pairs)
[(2, 2), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (1, 1), (3, 3), (1, 1), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (3, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (3, 3), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (1, 1), (1, 1), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0), (2, 2), (1, 1), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (3, 3), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 
2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (1, 1), (1, 1), (2, 2), (2, 2), (2, 2), (0, 0), (0, 2), (2, 2), (0, 0), (3, 3), (3, 3), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (1, 1), (3, 3), (3, 3), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (1, 0), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (3, 3), (0, 0), (0, 0), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (1, 1), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (1, 1), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (3, 3), (2, 2), (2, 2), (2, 2), (2, 2), (1, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (0, 
0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0)]
# Evaluate on the test split.
# Label codes: {'acc': 0, 'good': 1, 'unacc': 2, 'vgood': 3}
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("---------Confusion Matrix------------")
print(cm)
print()
print("---------Accuracy Score------------")
print(acc)
print()
print("---------Classification Report------------")
print(report)
---------Confusion Matrix------------
[[101 0 1 0]
[ 2 19 0 0]
[ 0 0 371 0]
[ 1 0 0 24]]
---------Accuracy Score------------
0.9922928709055877
---------Classification Report------------
precision recall f1-score support
0 0.97 0.99 0.98 102
1 1.00 0.90 0.95 21
2 1.00 1.00 1.00 371
3 1.00 0.96 0.98 25
accuracy 0.99 519
macro avg 0.99 0.96 0.98 519
weighted avg 0.99 0.99 0.99 519
# Impurity-based feature importances (they sum to 1).  zip() stops at the
# shorter sequence, so the trailing 'classes' column name is ignored.
importances = model_dt.feature_importances_
print(list(zip(df.columns, importances)))
[('buying', 0.1459054364730725), ('maint', 0.2558302984019575), ('doors', 0.05823344633319904), ('persons', 0.19534995691234955), ('lug_boot', 0.09892620952419463), ('safety', 0.2457546523552268)]
# Optional: export the fitted tree in Graphviz DOT format.
# NOTE(review): the commented snippet references `cars_df`, which is not
# defined in this file -- presumably it should be `df`; confirm before
# uncommenting.
# from sklearn import tree
# import graphviz
# with open("model_DecisionTree.txt", "w") as f:
# f = tree.export_graphviz(model_dt, feature_names=cars_df.columns[:-1], out_file=f)
# # generate the file and upload the code in webgraphviz.com to plot the decision tree
# Feature-column names (everything except the target 'classes'):
df.columns[:-1]
Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'], dtype='object')
%%time
# Render the fitted tree.  This can take several minutes for a deep,
# unpruned tree; the very large figure keeps deep nodes legible.
fig = plt.figure(figsize=(500, 400))
# class_names must list the class LABELS in the order of their encoded
# values (see the LabelEncoder mapping printed above).  The original
# list(df.columns[-1]) was a bug: it split the string 'classes' into its
# individual letters ['c','l','a','s','s','e','s'].
class_labels = ['acc', 'good', 'unacc', 'vgood']
dot_data = tree.plot_tree(model_dt,
                          feature_names=list(df.columns[:-1]),
                          class_names=class_labels,
                          filled=True)
plt.savefig("base_model.jpeg")
CPU times: total: 18.9 s Wall time: 2min 26s